In [1]:
import pandas as pd
import numpy as np
import plotly
import networkx
import matplotlib.pyplot as plt
In [2]:
# importing module
import pandas as pd
# dataset
data = pd.read_csv("C:\\Users\\rajat.k.srivastava\\Downloads\\Market_Basket_Optimisation.csv")
# printing the shape of the dataset
data.shape
Out[2]:
(7500, 20)
In [3]:
# printing the heading
data.head()
Out[3]:
shrimp almonds avocado vegetables mix green grapes whole weat flour yams cottage cheese energy drink tomato juice low fat yogurt green tea honey salad mineral water salmon antioxydant juice frozen smoothie spinach olive oil
0 burgers meatballs eggs NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 chutney NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 turkey avocado NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 mineral water milk energy bar whole wheat rice green tea NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 low fat yogurt NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
In [4]:
#Top 10 items
# importing module
import numpy as np
# Gather All Items of Each Transactions into Numpy Array
transaction = []
for i in range(0, data.shape[0]):
    for j in range(0, data.shape[1]):
        transaction.append(data.values[i,j])
In [5]:
#converting to numpy array
transaction = np.array(transaction)
transaction
Out[5]:
array(['burgers', 'meatballs', 'eggs', ..., 'nan', 'nan', 'nan'],
      dtype='<U32')
In [7]:
#  Transform Them a Pandas DataFrame
df = pd.DataFrame(transaction, columns=["items"]) 
df.head()
Out[7]:
items
0 burgers
1 meatballs
2 eggs
3 nan
4 nan
In [8]:
# Put 1 to Each Item For Making Countable Table, to be able to perform Group By
df["incident_count"] = 1 
In [9]:
#  Delete NaN Items from Dataset
indexNames = df[df['items'] == "nan" ].index
df.drop(indexNames , inplace=True)
In [10]:
# Making a New Appropriate Pandas DataFrame for Visualizations  
df_table = df.groupby("items").sum().sort_values("incident_count", ascending=False).reset_index()
In [11]:
#  Initial Visualizations
df_table.head(10).style.background_gradient(cmap='Greens')
Out[11]:
  items incident_count
0 mineral water 1787
1 eggs 1348
2 spaghetti 1306
3 french fries 1282
4 chocolate 1230
5 green tea 990
6 milk 972
7 ground beef 737
8 frozen vegetables 715
9 pancakes 713
In [12]:
# importing required module
import plotly.express as px
# to have a same origin
df_table["all"] = "all" 
# creating tree map using plotly
fig = px.treemap(df_table.head(30), path=['all', "items"], values='incident_count',
                  color=df_table["incident_count"].head(30), hover_data=['items'],
                  color_continuous_scale='Greens',
                )
# ploting the treemap
fig.show()
C:\Users\rajat.k.srivastava\Anaconda3\lib\site-packages\plotly\express\_core.py:1637: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  df_all_trees = df_all_trees.append(df_tree, ignore_index=True)
C:\Users\rajat.k.srivastava\Anaconda3\lib\site-packages\plotly\express\_core.py:1637: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  df_all_trees = df_all_trees.append(df_tree, ignore_index=True)
In [13]:
# Transform Every Transaction to Seperate List & Gather Them into Numpy Array
transaction = []
for i in range(data.shape[0]):
    transaction.append([str(data.values[i,j]) for j in range(data.shape[1])])

# creating the numpy array of the transactions
transaction = np.array(transaction)
In [14]:
# importing the required module
from mlxtend.preprocessing import TransactionEncoder

# initializing the transactionEncoder
te = TransactionEncoder()
te_ary = te.fit(transaction).transform(transaction)
dataset = pd.DataFrame(te_ary, columns=te.columns_)

# dataset after encoded
dataset.head()
Out[14]:
asparagus almonds antioxydant juice asparagus avocado babies food bacon barbecue sauce black tea blueberries ... turkey vegetables mix water spray white wine whole weat flour whole wheat pasta whole wheat rice yams yogurt cake zucchini
0 False False False False False False False False False False ... False False False False False False False False False False
1 False False False False False False False False False False ... False False False False False False False False False False
2 False False False False True False False False False False ... True False False False False False False False False False
3 False False False False False False False False False False ... False False False False False False True False False False
4 False False False False False False False False False False ... False False False False False False False False False False

5 rows × 121 columns

In [15]:
# select top 50 items
first50 = df_table["items"].head(50).values 

# Extract Top50
dataset = dataset.loc[:,first50] 

# shape of the dataset
dataset.head()
Out[15]:
mineral water eggs spaghetti french fries chocolate green tea milk ground beef frozen vegetables pancakes ... ham energy bar energy drink pepper cereals vegetables mix muffins oil french wine fresh tuna
0 False True False False False False False False False False ... False False False False False False False False False False
1 False False False False False False False False False False ... False False False False False False False False False False
2 False False False False False False False False False False ... False False False False False False False False False False
3 True False False False False True True False False False ... False True False False False False False False False False
4 False False False False False False False False False False ... False False False False False False False False False False

5 rows × 50 columns

In [16]:
# importing the required module
from mlxtend.frequent_patterns import apriori, association_rules

# Extracting the most frequest itemsets via Mlxtend.
# The length column has been added to increase ease of filtering.
frequent_itemsets = apriori(dataset, min_support=0.01, use_colnames=True)
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))
# printing the frequent itemset
frequent_itemsets
Out[16]:
support itemsets length
0 0.238267 (mineral water) 1
1 0.179733 (eggs) 1
2 0.174133 (spaghetti) 1
3 0.170933 (french fries) 1
4 0.163867 (chocolate) 1
... ... ... ...
229 0.010933 (ground beef, mineral water, chocolate) 3
230 0.011067 (ground beef, mineral water, milk) 3
231 0.011067 (mineral water, milk, frozen vegetables) 3
232 0.010533 (eggs, chocolate, spaghetti) 3
233 0.010933 (spaghetti, chocolate, milk) 3

234 rows × 3 columns

In [17]:
# printing the frequntly items 
frequent_itemsets[ (frequent_itemsets['length'] == 2) &
                   (frequent_itemsets['support'] >= 0.05) ]
Out[17]:
support itemsets length
50 0.050933 (eggs, mineral water) 2
51 0.059733 (spaghetti, mineral water) 2
53 0.052667 (mineral water, chocolate) 2
In [18]:
# printing the frequntly items with length 3
frequent_itemsets[ (frequent_itemsets['length'] == 3) ].head(3)
Out[18]:
support itemsets length
217 0.014267 (eggs, mineral water, spaghetti) 3
218 0.013467 (eggs, mineral water, chocolate) 3
219 0.013067 (eggs, mineral water, milk) 3
In [19]:
#  We set our metric as "Lift" to define whether antecedents & consequents are dependent our not
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.2)
rules["antecedents_length"] = rules["antecedents"].apply(lambda x: len(x))
rules["consequents_length"] = rules["consequents"].apply(lambda x: len(x))
rules.sort_values("lift",ascending=False)
Out[19]:
antecedents consequents antecedent support consequent support support confidence lift leverage conviction antecedents_length consequents_length
218 (ground beef) (herb & pepper) 0.098267 0.049467 0.016000 0.162822 3.291555 0.011139 1.135402 1 1
219 (herb & pepper) (ground beef) 0.049467 0.098267 0.016000 0.323450 3.291555 0.011139 1.332841 1 1
295 (ground beef) (spaghetti, mineral water) 0.098267 0.059733 0.017067 0.173677 2.907540 0.011197 1.137893 1 2
290 (spaghetti, mineral water) (ground beef) 0.059733 0.098267 0.017067 0.285714 2.907540 0.011197 1.262427 2 1
312 (olive oil) (spaghetti, mineral water) 0.065733 0.059733 0.010267 0.156187 2.614731 0.006340 1.114306 1 2
... ... ... ... ... ... ... ... ... ... ... ...
60 (eggs) (low fat yogurt) 0.179733 0.076400 0.016800 0.093472 1.223453 0.003068 1.018832 1 1
122 (escalope) (french fries) 0.079333 0.170933 0.016400 0.206723 1.209376 0.002839 1.045116 1 1
123 (french fries) (escalope) 0.170933 0.079333 0.016400 0.095944 1.209376 0.002839 1.018373 1 1
164 (shrimp) (green tea) 0.071333 0.132000 0.011333 0.158879 1.203625 0.001917 1.031956 1 1
165 (green tea) (shrimp) 0.132000 0.071333 0.011333 0.085859 1.203625 0.001917 1.015890 1 1

350 rows × 11 columns

In [20]:
# Sort values based on confidence
rules.sort_values("confidence",ascending=False)
Out[20]:
antecedents consequents antecedent support consequent support support confidence lift leverage conviction antecedents_length consequents_length
269 (eggs, ground beef) (mineral water) 0.020000 0.238267 0.010133 0.506667 2.126469 0.005368 1.544054 2 1
327 (ground beef, milk) (mineral water) 0.022000 0.238267 0.011067 0.503030 2.111207 0.005825 1.532756 2 1
321 (ground beef, chocolate) (mineral water) 0.023067 0.238267 0.010933 0.473988 1.989319 0.005437 1.448130 2 1
334 (milk, frozen vegetables) (mineral water) 0.023600 0.238267 0.011067 0.468927 1.968075 0.005444 1.434328 2 1
34 (soup) (mineral water) 0.050533 0.238267 0.023067 0.456464 1.915771 0.011026 1.401441 1 1
... ... ... ... ... ... ... ... ... ... ... ...
46 (mineral water) (red wine) 0.238267 0.028133 0.010933 0.045887 1.631053 0.004230 1.018607 1 1
313 (mineral water) (spaghetti, olive oil) 0.238267 0.022933 0.010267 0.043089 1.878880 0.004802 1.021063 1 2
49 (mineral water) (cereals) 0.238267 0.025733 0.010267 0.043089 1.674442 0.004135 1.018137 1 1
272 (mineral water) (eggs, ground beef) 0.238267 0.020000 0.010133 0.042529 2.126469 0.005368 1.023530 1 2
277 (mineral water) (spaghetti, french fries) 0.238267 0.027600 0.010133 0.042529 1.540920 0.003557 1.015593 1 2

350 rows × 11 columns

In [ ]:
 
In [ ]: